# -*- coding: utf-8 -*-
import logging
from scrapy import signals
from traceback import format_exc
from pymongo import UpdateOne
from zc_core.client.mongo_client import Mongo

logger = logging.getLogger(__name__)


class CatalogTreeExtension(object):
    """Scrapy extension that back-fills ``parentId`` on catalog documents.

    When the ``catalog`` spider closes, the ``catalog_pool`` and
    ``cat_<batch_no>`` collections of its batch are scanned and each category
    whose parent can be resolved gets the parent's ``_id`` stored in
    ``parentId``.
    """

    @classmethod
    def from_crawler(cls, crawler):
        """Create the extension and connect it to the ``spider_closed`` signal."""
        ext = cls()
        crawler.signals.connect(ext.spider_closed, signal=signals.spider_closed)
        return ext

    def spider_closed(self, spider):
        """Rebuild the category tree after the ``catalog`` spider finishes.

        Nothing happens for other spiders or when the spider has no batch
        number. This is best effort: any failure is written to the spider's
        logger and never propagated to Scrapy.
        """
        try:
            if spider.name == 'catalog' and spider.batch_no:
                batch_no = spider.batch_no
                self.update_catalog_tree(batch_no, 'catalog_pool')
                self.update_catalog_tree(batch_no, 'cat_{}'.format(batch_no))
        except Exception:
            spider.logger.error(format_exc())

    @staticmethod
    def _resolve_parent_ids(rows):
        """Return ``(category_id, parent_id)`` pairs for the rows that need one.

        ``rows`` must be ordered so that parents come before their children.
        The parent of a category is the one whose ``catalogIdTrim`` equals the
        child's code without its last two characters. Level-1 categories always
        get an empty ``parentId``. Rows whose parent cannot be resolved are
        left out, so their stored ``parentId`` stays untouched.
        """
        id_by_code = {}
        links = []
        for row in rows:
            cat_id = row.get('_id')
            code = row.get('catalogIdTrim')
            # A missing code cannot be sliced, and an empty one would resolve
            # to itself (''[0:-2] == ''), so neither is registered or looked up.
            if code:
                id_by_code[code] = cat_id
            if row.get('level') == 1:
                links.append((cat_id, ''))
                continue
            parent_id = id_by_code.get(code[0:-2]) if code else None
            if parent_id:
                links.append((cat_id, parent_id))
        return links

    def update_catalog_tree(self, batch_no, coll_name):
        """Write the resolved ``parentId`` of every category in ``coll_name``.

        :param batch_no: batch number handed to the ``Mongo`` client.
        :param coll_name: name of the collection holding the categories.
        """
        mongo = Mongo(batch_no=batch_no)
        coll = mongo.get_collection(coll_name)
        # Compare with None: PyMongo 4 collections raise NotImplementedError
        # when used in a boolean context.
        if coll is None:
            return

        # Sorted by level so that a parent is seen before its children.
        links = self._resolve_parent_ids(coll.find().sort("level"))
        # Only the computed field is written back; re-sending the whole
        # document (including ``_id``) is unnecessary.
        bulk_list = [
            UpdateOne({'_id': cat_id}, {'$set': {'parentId': parent_id}}, upsert=False)
            for cat_id, parent_id in links
        ]
        if bulk_list:
            mongo.bulk_write(coll_name, bulk_list)
            logger.info("更新分类结构: coll=%s, cnt=%s", coll_name, len(bulk_list))


if __name__ == '__main__':
    # Manual run: recompute the parent links of the catalog pool for one
    # fixed batch. A parent code is the child's code without its last two
    # characters, e.g. '16010415'[0:-2] == '160104'.
    extension = CatalogTreeExtension()
    extension.update_catalog_tree('20200213', 'catalog_pool')
